import matplotlib.pyplot as plt
import pandas as pd
# Load the titles CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='./data/netflix_titles.csv')
# Function to extract numeric value from duration
def extract_duration(duration):
    """Return the leading integer of a duration string such as ``'90 min'``.

    Args:
        duration: A raw value from the 'duration' column.

    Returns:
        * the value itself when it is a float (presumably so NaN cells from
          pandas pass through unchanged -- confirm against the data);
        * the first whitespace-separated token converted to ``int`` when the
          value is a string;
        * ``None`` when the value is empty, not numeric, or neither a string
          nor a float.
    """
    if isinstance(duration, float):
        return duration
    # Anything that is not text (e.g. None) has no token to parse.
    if not isinstance(duration, str):
        return None
    tokens = duration.split()
    # Empty or whitespace-only strings yield no tokens at all.
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None
# Derive a numeric duration column from the raw 'duration' values
df['duration_numeric'] = df['duration'].apply(extract_duration)
# Keep only titles whose release year falls within the last 20 years
current_year = pd.Timestamp.now().year
last_20_years_df = df.loc[df['release_year'].ge(current_year - 20)]
# Split the recent titles by type
tv_shows_df = last_20_years_df.loc[last_20_years_df['type'].eq('TV Show')]
movies_df = last_20_years_df.loc[last_20_years_df['type'].eq('Movie')]
# Side-by-side scatter plots of duration against release year:
# TV shows on the left, movies on the right.
plt.figure(figsize=(12, 6))
panels = (
    (tv_shows_df, 'Duration (Seasons)', 'TV Shows (Last 20 Years)'),
    (movies_df, 'Duration (Minutes)', 'Movies (Last 20 Years)'),
)
for position, (frame, y_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(frame['release_year'], frame['duration_numeric'])
    plt.xlabel('Release Year')
    plt.ylabel(y_label)
    plt.title(panel_title)
plt.tight_layout()
plt.show()
import plotly.express as px
# fig = px.scatter(df, x='release_year', y='country')
# fig.update_layout(title='Explore Data', xaxis_title='Release Year', yaxis_title='Country release' )
# fig.show()
# Scatter of release year against country; the hover box is headed by the title.
fig = px.scatter(
    df,
    x='release_year',
    y='country',
    title='Explore Data',
    labels={'release_year': 'Release Year', 'country': 'Country release'},
    hover_name='title',
    hover_data={'title': False},
)
# Set the figure title and both axis titles explicitly
fig.update_layout(
    title='Explore Data',
    xaxis_title='Release Year',
    yaxis_title='Country release',
)
# Render the figure
fig.show()
# Hover on the plot to get the options to zoom
from ipywidgets import interact, Dropdown
# Dropdown choices: every distinct value of the 'director' column.
# Missing values are dropped first so the widget does not offer a NaN entry,
# which could never match a row in an equality filter.
directors = df['director'].dropna().unique()
# Director selected when the widget is first shown
default_director = "Spike Lee"
def plot_movies_by_director(selected_director=default_director):
    """Plot how many titles a director has and when each was released.

    Draws one figure with two stacked panels: a single bar holding the number
    of rows in ``df`` whose 'director' equals ``selected_director``, and a
    line plot of release year per title for those rows.

    Args:
        selected_director: Exact value to match against the 'director' column.
            ``None`` or a missing value (NaN) makes the function return
            without plotting.
    """
    # NaN never equals anything, so a NaN selection would only produce an
    # empty figure titled "nan"; treat it like "nothing selected".
    if selected_director is None or pd.isna(selected_director):
        return
    movies = df.loc[df['director'] == selected_director, ['title', 'release_year']]

    plt.figure(figsize=(16, 8))

    # Top panel: number of matching titles
    ax1 = plt.subplot(2, 1, 1)
    movies_count = len(movies)
    ax1.bar(selected_director, movies_count, color='skyblue')
    ax1.set_ylabel('Number of Movies')
    ax1.set_title(f"Number of Movies Directed by {selected_director}")

    # Bottom panel: release year of each matching title
    ax2 = plt.subplot(2, 1, 2)
    ax2.plot(movies['title'], movies['release_year'], marker='o', color='orange', linestyle='-')
    ax2.set_ylabel('Release Year')
    ax2.set_xlabel('Movie Title')
    ax2.set_title(f"Release Years of Movies Directed by {selected_director}")
    ax2.grid(True)
    # Rotate the title labels on the x axis
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
# Build the director dropdown, then bind it to the plotting function
director_dropdown = Dropdown(options=directors, value=default_director)
interact(plot_movies_by_director, selected_director=director_dropdown)
# Output:
# interactive(children=(Dropdown(description='selected_director', index=81, options=('Kirsten Johnson', nan, 'Ju…
# <function __main__.plot_movies_by_director(selected_director='Spike Lee')>
from wordcloud import WordCloud
# Path to a TrueType font file used to render the words
font_path = "./data/Lato-Regular.ttf"
# Drop rows without a director and split multi-director credits on ", "
directors = df['director'].dropna().str.split(', ')
# Number of titles credited to each individual director
director_counts = directors.explode().value_counts().to_dict()
# Label each director as "Name (count)" and size the label by that count.
# The frequencies are passed directly: generate() on a joined string would
# re-tokenize it into separate words, so full names and the "(count)" suffix
# would not survive as single labels.
frequencies = {
    f"{director} ({count})": count
    for director, count in director_counts.items()
}
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    font_path=font_path,
).generate_from_frequencies(frequencies)
# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Split the full table by title type
tv_shows = df.loc[df['type'].eq('TV Show')]
movies = df.loc[df['type'].eq('Movie')]
# Share of titles per distinct 'country' value, expressed in percent
tv_shows_country_counts = tv_shows['country'].value_counts(normalize=True).mul(100)
movies_country_counts = movies['country'].value_counts(normalize=True).mul(100)
# Function to plot pie chart with custom label
def plot_pie_chart(country_counts, title, threshold=5):
    """Draw a pie chart, folding small entries into a single 'Other' wedge.

    Args:
        country_counts: Series of values indexed by label; the callers in this
            file pass percentage shares.
        title: Title shown above the chart.
        threshold: Entries below this value are summed into 'Other', and
            wedges covering less than this percentage of the pie get no
            percentage text. Defaults to 5.
    """
    other_countries = country_counts[country_counts < threshold]
    # Explicit copy so adding the 'Other' entry cannot touch the caller's Series.
    major_countries = country_counts[country_counts >= threshold].copy()
    other_total = other_countries.sum()
    # Skip the 'Other' wedge when nothing was folded into it; a zero-size
    # wedge would only add a stray label.
    if other_total > 0:
        major_countries['Other'] = other_total
    plt.figure()
    major_countries.plot.pie(
        autopct=lambda p: '{:.1f}%'.format(p) if p >= threshold else '',
        label='',
        colors=plt.cm.tab20.colors,
    )
    plt.title(title)
    plt.ylabel('')
    plt.show()
# One pie chart per title type
for shares, chart_title in (
    (tv_shows_country_counts, 'TV Shows by Country'),
    (movies_country_counts, 'Movies by Country'),
):
    plot_pie_chart(shares, chart_title)
import plotly.graph_objects as go
# Marker-only scatter of the raw 'duration' values against release year;
# each point carries its title as hover text.
tooltip_scatter = go.Scatter(
    x=df['release_year'],
    y=df['duration'],
    mode='markers',
    text=df['title'],
    marker={'size': 8},
)
fig = go.Figure(data=[tooltip_scatter])
fig.update_layout(
    title='Elaborate with Tooltips',
    xaxis_title='Release Year',
    yaxis_title='Duration',
)
fig.show()
# Split the per-title director lists by title type.
# NOTE: `directors` must be the Series of split director lists here, not the
# array of unique names built for the dropdown.
tv_directors = directors[df['type'].eq('TV Show')]
movie_directors = directors[df['type'].eq('Movie')]
# Number of titles credited to each individual director, per type
tv_director_counts = tv_directors.explode().value_counts()
movie_director_counts = movie_directors.explode().value_counts()
# Keep TV directors with at least 2 titles and movie directors with at least 10
filtered_tv_directors = tv_director_counts.loc[tv_director_counts.ge(2)]
filtered_movie_directors = movie_director_counts.loc[movie_director_counts.ge(10)]
# Bar charts of titles per director: TV shows on the left, movies on the right
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
bar_panels = (
    (filtered_tv_directors, 'skyblue',
     'Frequency of Directors in TV Shows (>= 2 occurrences)'),
    (filtered_movie_directors, 'salmon',
     'Frequency of Directors in Movies (>= 10 occurrences)'),
)
for ax, (counts, bar_color, panel_title) in zip(axs, bar_panels):
    ax.bar(counts.index, counts.values, color=bar_color)
    ax.set_xlabel('Director')
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    # Rotate the director names on the x axis
    ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Reference used: https://medium.com/codex/how-to-automatically-generate-data-structure-for-sankey-diagrams-6082e332139f
import plotly.graph_objects as go
import pandas as pd
# Reload the titles CSV into `df`; this replaces the earlier frame, so columns
# added to it before this point are not carried over.
df = pd.read_csv(filepath_or_buffer='./data/netflix_titles.csv')
def map_year_to_decade(year):
    """Return the decade label for *year*, e.g. ``1994`` -> ``'1990s'``."""
    decade_start = (year // 10) * 10
    return f"{decade_start}s"
# Label every row with the decade of its release year
df['Decade'] = df['release_year'].map(map_year_to_decade)
def data_snakey(data, path, value_col):
    """Build the node and link lists for a Sankey diagram.

    For every consecutive pair of columns in ``path`` a link is created from
    each distinct parent value to each distinct child value that occurs in the
    same rows. The link's value is the sum of ``value_col`` over exactly the
    rows that have both that parent and that child.

    Args:
        data: DataFrame holding the columns named in ``path`` and ``value_col``.
        path: Column names ordered from the first level to the last.
        value_col: Column summed to give each link its value.

    Returns:
        A dict with four lists:
        ``'label'``  -- one string per distinct node, in first-seen order;
        ``'source'`` / ``'target'`` -- indices into ``'label'`` for each link;
        ``'value'``  -- the summed value of each link, as a string.
        Nodes are keyed by their string form, so equal strings occurring in
        different columns share one node.
    """
    labels = []
    label_index = {}

    def node_id(name):
        # Register each label once; the dict makes the lookup O(1).
        if name not in label_index:
            label_index[name] = len(labels)
            labels.append(name)
        return label_index[name]

    sources = []
    targets = []
    values = []
    for parent_col, child_col in zip(path, path[1:]):
        for parent in data[parent_col].unique():
            parent_id = node_id(str(parent))
            parent_rows = data[data[parent_col] == parent]
            for sub in parent_rows[child_col].unique():
                sources.append(parent_id)
                targets.append(node_id(str(sub)))
                # Restrict the sum to rows under this parent; summing every
                # row with this child would overstate the link whenever the
                # child also appears under another parent.
                link_rows = parent_rows[parent_rows[child_col] == sub]
                # Kept as a string so the returned structure has the same
                # element types as before.
                values.append(str(link_rows[value_col].sum()))

    return {
        'label': labels,
        'source': sources,
        'target': targets,
        'value': values,
    }
# Node/link lists for the type -> rating -> Decade flow.
# NOTE(review): link values are sums of 'release_year', not title counts --
# confirm that this is the intended weighting.
con_data = data_snakey(data=df, path=['type', 'rating', 'Decade'], value_col='release_year')
df
fig = go.Figure(
    data=[
        go.Sankey(
            node={
                'pad': 15,
                'thickness': 20,
                'line': {'color': "black", 'width': 0.5},
                'label': con_data['label'],
            },
            link={
                'source': con_data['source'],
                'target': con_data['target'],
                'value': con_data['value'],
            },
        )
    ]
)
fig.update_layout(height=700, margin={'t': 0, 'b': 0})